{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "307804a3-c02b-4a57-ac0d-172c30ddc851",
   "metadata": {},
   "source": [
    "# Chroma Vector Store"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f7010b1d-d1bb-4f08-9309-a328bb4ea396",
   "metadata": {},
   "source": [
    "#### Creating a Chroma Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d48af8e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0ce3143d-198c-4dd2-8e5a-c5cdf94f017a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import chromadb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "667f3cb3-ce18-48d5-b9aa-bfc1a1f0f0f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
      "INFO:chromadb:Running Chroma using direct local API.\n",
      "Running Chroma using direct local API.\n",
      "Running Chroma using direct local API.\n",
      "INFO:numexpr.utils:Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n",
      "NumExpr defaulting to 8 threads.\n",
      "NumExpr defaulting to 8 threads.\n",
      "WARNING:chromadb:Using embedded DuckDB without persistence: data will be transient\n",
      "Using embedded DuckDB without persistence: data will be transient\n",
      "Using embedded DuckDB without persistence: data will be transient\n",
      "INFO:clickhouse_connect.driver.ctypes:Successfully imported ClickHouse Connect C data optimizations\n",
      "Successfully imported ClickHouse Connect C data optimizations\n",
      "Successfully imported ClickHouse Connect C data optimizations\n",
      "INFO:clickhouse_connect.driver.ctypes:Successfully import ClickHouse Connect C/Numpy optimizations\n",
      "Successfully import ClickHouse Connect C/Numpy optimizations\n",
      "Successfully import ClickHouse Connect C/Numpy optimizations\n",
      "INFO:clickhouse_connect.json_impl:Using python library for writing JSON byte strings\n",
      "Using python library for writing JSON byte strings\n",
      "Using python library for writing JSON byte strings\n",
      "WARNING:chromadb.api.models.Collection:No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n",
      "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n",
      "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/suo/miniconda3/envs/llama/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
      "Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
      "Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n",
      "INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu\n",
      "Use pytorch device: cpu\n",
      "Use pytorch device: cpu\n"
     ]
    }
   ],
   "source": [
    "chroma_client = chromadb.Client()\n",
    "chroma_collection = chroma_client.create_collection(\"quickstart\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0a2bcc07",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import VectorStoreIndex, StorageContext\n",
    "from llama_index.vector_stores import ChromaVectorStore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "68cbd239-880e-41a3-98d8-dbb3fab55431",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.schema import TextNode\n",
    "\n",
    "nodes = [\n",
    "    TextNode(\n",
    "        text=\"Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.\",\n",
    "        metadata={\n",
    "            \"category\": \"Sports\",\n",
    "            \"country\": \"United States\",\n",
    "        },\n",
    "    ),\n",
    "    TextNode(\n",
    "        text=\"Angelina Jolie is an American actress, filmmaker, and humanitarian. She has received numerous awards for her acting and is known for her philanthropic work.\",\n",
    "        metadata={\n",
    "            \"category\": \"Entertainment\",\n",
    "            \"country\": \"United States\",\n",
    "        },\n",
    "    ),\n",
    "    TextNode(\n",
    "        text=\"Elon Musk is a business magnate, industrial designer, and engineer. He is the founder, CEO, and lead designer of SpaceX, Tesla, Inc., Neuralink, and The Boring Company.\",\n",
    "        metadata={\n",
    "            \"category\": \"Business\",\n",
    "            \"country\": \"United States\",\n",
    "        },\n",
    "    ),\n",
    "    TextNode(\n",
    "        text=\"Rihanna is a Barbadian singer, actress, and businesswoman. She has achieved significant success in the music industry and is known for her versatile musical style.\",\n",
    "        metadata={\n",
    "            \"category\": \"Music\",\n",
    "            \"country\": \"Barbados\",\n",
    "        },\n",
    "    ),\n",
    "    TextNode(\n",
    "        text=\"Cristiano Ronaldo is a Portuguese professional footballer who is considered one of the greatest football players of all time. He has won numerous awards and set multiple records during his career.\",\n",
    "        metadata={\n",
    "            \"category\": \"Sports\",\n",
    "            \"country\": \"Portugal\",\n",
    "        },\n",
    "    ),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ba1558b3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "35369eda",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 211 tokens\n",
      "> [build_index_from_nodes] Total embedding token usage: 211 tokens\n",
      "> [build_index_from_nodes] Total embedding token usage: 211 tokens\n"
     ]
    }
   ],
   "source": [
    "index = VectorStoreIndex(nodes, storage_context=storage_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "bedbb693-725f-478f-be26-fa7180ea38b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.indices.vector_store.retrievers import VectorIndexAutoRetriever\n",
    "from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo\n",
    "\n",
    "\n",
    "vector_store_info = VectorStoreInfo(\n",
    "    content_info=\"brief biography of celebrities\",\n",
    "    metadata_info=[\n",
    "        MetadataInfo(\n",
    "            name=\"category\",\n",
    "            type=\"str\",\n",
    "            description=\"Category of the celebrity, one of [Sports, Entertainment, Business, Music]\",\n",
    "        ),\n",
    "        MetadataInfo(\n",
    "            name=\"country\",\n",
    "            type=\"str\",\n",
    "            description=\"Country of the celebrity, one of [United States, Barbados, Portugal]\",\n",
    "        ),\n",
    "    ],\n",
    ")\n",
    "retriever = VectorIndexAutoRetriever(index, vector_store_info=vector_store_info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "eeb18e9c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using query str: celebrities\n",
      "Using query str: celebrities\n",
      "Using query str: celebrities\n",
      "INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using filters: {'country': 'United States'}\n",
      "Using filters: {'country': 'United States'}\n",
      "Using filters: {'country': 'United States'}\n",
      "INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using top_k: 2\n",
      "Using top_k: 2\n",
      "Using top_k: 2\n",
      "INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens\n",
      "> [retrieve] Total LLM token usage: 0 tokens\n",
      "> [retrieve] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 3 tokens\n",
      "> [retrieve] Total embedding token usage: 3 tokens\n",
      "> [retrieve] Total embedding token usage: 3 tokens\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[NodeWithScore(node=Node(text='Angelina Jolie is an American actress, filmmaker, and humanitarian. She has received numerous awards for her acting and is known for her philanthropic work.', doc_id='7389c8ad-2feb-4cf3-a8da-6c0f08c0f222', embedding=None, doc_hash='1171ef7bb1b89283a1012fecb0ea7a831a0e38c2ed6fac9fbfdd62ad64063934', extra_info={'category': 'Entertainment', 'country': 'United States'}, node_info={}, relationships={}), score=0.3262841090927262),\n",
       " NodeWithScore(node=Node(text='Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.', doc_id='da13fc89-72cb-401e-8445-9b9680372fbf', embedding=None, doc_hash='44c17458239bdba3c72f8ed6ac12e096b4c7965f6b5154d74b7a10484dad16a4', extra_info={'category': 'Sports', 'country': 'United States'}, node_info={}, relationships={}), score=0.3734403491142674)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "retriever.retrieve(\"Tell me about two celebrities from United States\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "51f00cde",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using query str: Sports celebrities\n",
      "Using query str: Sports celebrities\n",
      "Using query str: Sports celebrities\n",
      "INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using filters: {'category': 'Sports', 'country': 'United States'}\n",
      "Using filters: {'category': 'Sports', 'country': 'United States'}\n",
      "Using filters: {'category': 'Sports', 'country': 'United States'}\n",
      "INFO:llama_index.indices.vector_store.auto_retriever.auto_retriever:Using top_k: 2\n",
      "Using top_k: 2\n",
      "Using top_k: 2\n",
      "INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens\n",
      "> [retrieve] Total LLM token usage: 0 tokens\n",
      "> [retrieve] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 2 tokens\n",
      "> [retrieve] Total embedding token usage: 2 tokens\n",
      "> [retrieve] Total embedding token usage: 2 tokens\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[NodeWithScore(node=Node(text='Michael Jordan is a retired professional basketball player, widely regarded as one of the greatest basketball players of all time.', doc_id='da13fc89-72cb-401e-8445-9b9680372fbf', embedding=None, doc_hash='44c17458239bdba3c72f8ed6ac12e096b4c7965f6b5154d74b7a10484dad16a4', extra_info={'category': 'Sports', 'country': 'United States'}, node_info={}, relationships={}), score=0.3328886457329614)]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "retriever.retrieve(\"Tell me about Sports celebrities from United States\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8387a0e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "vscode": {
   "interpreter": {
    "hash": "0ac390d292208ca2380c85f5bce7ded36a7a25670a97c40b8009630eb36cb06e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
